
##################
## Solar Siting estimation for Texas (bootstraps)


require(data.table)
require(iterators)
require(foreach)
require(parallel)
require(doSNOW)
require(gghighlight)
require(gridExtra)
# require(gpuR)

WORK = file.path('Z:/user/ajk41/Solar siting/Scripts/JAERE Code Repository/Data JAERE')
WORK.OUT = file.path(WORK, 'intermediate_output')
dir.create(WORK.OUT)

#### Get data 
load(file.path(WORK.OUT, 'usingWorkspace.Rdata'))





#### For v5, reshape the df.split and df.common ####
df.useMYS = split(rbindlist(map(df.split.useMYS, function(x) {
  x$ORISPL = rownames(x)
  return(x)}), idcol = 'INTX'), by='ymn')

df.useMYS = lapply(df.useMYS, function(xx){
  list(df = df.split[xx$ORISPL],
              ym = xx$ym[1],
              ymn = xx$ymn[1])
})

#### jDecompose function ####
jDecompose <- function(x){
  if(class(x)[1]%in%c('data.frame','data.table')) x = x$ym
  x = as.character(x)
  y = strsplit(x = x, split="---")
  z = unlist(strsplit(x=y[[1]][2], split="--"))
  return(data.frame(intx = y[[1]][1],
                    ym = z, stringsAsFactors=F))
} # takes the long ym value, parses out the intx, year-mo for the ym




# Cool. Merge in National NSRDB solar insolation as SOL_XXXX --------------------------------------
WEST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['WEST']]), value=T)))
EAST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['EAST']]), value=T)))
TRE.names  = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['TRE']] ), value=T)))

map.names = rbind(data.frame(INTX = 'W',NAME = WEST.names),
                  data.frame(INTX = 'E',NAME = EAST.names),
                  data.frame(INTX = 'T',NAME = TRE.names))
map.names$pcac = sapply(strsplit( as.character(map.names$NAME),'_[[:alnum:]]{1}'), '[', 2)


NN = readRDS(file.path(WORK, 'NSRDB_siting----2018-10-02.rds'))  # NSRDB requires manually querying and manually downloading NSRDB files and is not coded in R.
##--> some pcac (planning control area) mappings are combined below. This combines the solar for each subregion.
NN[pcac=='MROE',pcac:='MROW']
NN[pcac=='RFCM',pcac:='RFCW']
NN[pcac=='NEWE',pcac:='NYUP']
NN[pcac=='SPSO',pcac:='SPNO']
NN = merge(NN, map.names, by='pcac', all=F)
NNN = dcast(NN, formula = dh_e ~ NAME, fun.aggregate = sum,  value.var = 'solar_index', drop=T)


df.common.intx[['WEST']] = merge(df.common.intx[['WEST']], NNN[,grep('dh_e|_W', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['EAST']] = merge(df.common.intx[['EAST']], NNN[,grep('dh_e|_E', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['TRE']]  = merge(df.common.intx[['TRE']] , NNN[,grep('dh_e|_T', names(NNN), value=F), with=F], by = 'dh_e')

## 
# New: combine down high-variance eGrid subregions (solar and load)
df.common.intx[['EAST']][,c('PCA_EMROW'):=list(PCA_EMROE+PCA_EMROW)]
df.common.intx[['EAST']][,c('PCA_ERFCW'):=list(PCA_ERFCM+PCA_ERFCW)]
df.common.intx[['EAST']][,c('PCA_ENYUP'):=list(PCA_ENEWE+PCA_ENYUP)]
df.common.intx[['EAST']][,c('PCA_ESPNO'):=list(PCA_ESPSO+PCA_ESPNO)]
df.common.intx[['EAST']][,c('PCA_EMROE','PCA_ERFCM',
                            'PCA_ENEWE','PCA_ESPSO'):=NULL]





# Preparation for Bootstrapping -------------------------------------------
df.common.intx.M = df.common.intx['TRE']
df.useMYS.M = df.useMYS[grepl('TRE',names(df.useMYS))]





cl <- snow::makeCluster(rep('localhost',22), type='SOCK')
registerDoSNOW(cl)

Blist = as.list(rep(NA, 2000))
common.vars = grep('_TERCT', names(df.common.intx.M[[1]]), value=T)  #--> need these as I have to re-demean them after bootstrap draw.
split.vars = grep('_MASS', names(df.useMYS.M[[1]][[1]][[1]]), value=T)
for(B in 1:2000){
  print(B)
  
  
## Setting up bootstrap selection; take .M, make .B
  # for each ym in the data, draw over days with replacement?
  ## start with common, make index of days
if(B==2){
  df.common.intx.B = df.common.intx.M
  df.useMYS.B = df.useMYS.M
} else {
  df.common.intx.B = copy(df.common.intx.M)[[1]] %>%
    dplyr::mutate(OPR_DT:=date(dh_e)) %>%
    group_by(ym, OPR_DT) %>% nest() %>%
    group_by(ym) %>%
    slice(sample(NROW(.),size=NROW(.), replace=T)) %>%
    unnest() %>%
    dplyr::arrange(ym, dh_e) %>%
    setDT()
  
  
  df.common.intx.B[,(common.vars):=lapply(.SD, function(x) x - mean(x)),by=list(ym, hour, weekday), .SDcols = common.vars]
  df.common.index = df.common.intx.B[,.(dh_e)]
  
  df.useMYS.B = lapply(df.useMYS.M, function(TOP){
    TOP[['df']] = lapply(TOP[['df']], function(TOPdd){
                    TOPdd = TOPdd[match(df.common.index$dh_e[which(df.common.index$dh_e %in% TOPdd$dh_e)], TOPdd$dh_e),]
                    TOPdd[,(split.vars):=lapply(.SD, function(x) x - mean(x)),by = list(ym, hour(dh_e), weekday), .SDcols = split.vars]
                    TOPdd} )
    TOP
  })
  
  df.common.intx.B = list(TRE = df.common.intx.B)
} # end if B==2 (to get full sample estimate)
## Running bootstrap  (qr.SOL only)
  
RES = foreach(df.iter = iter(df.useMYS.B, by='cell'), .errorhandling = 'pass', .inorder = F,
              .packages = c('data.table'), 
              .noexport = c('qr','wc','wdfc','wdfc.mm.g','wdfc.mm','why','cf','coef.results','Xs'),
              .verbose=T) %dopar% {
  hasSOL = F  # update later with conditional 
  jd = jDecompose(df.iter[['ym']])



# With-SOL ----------------------------------------------------------------

  
# if(grepl('WEST', df.iter$ymn)){ 
  if(T==T){
  hasSOL = T
  
  wdfc = df.common.intx.B[[jd$intx[1]]][ym%in%jd$ym,][,SOL_WSPSO:=NULL]
  wdfc[is.na(wdfc)] = 0
  wdfc.PCAs = grep(names(wdfc), pattern='PCA_|SOL_', value=T)  #--> new v4: add "SOL_"

  wdfc[,(wdfc.PCAs):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym, hour, weekday), .SDcols = wdfc.PCAs]
  wdfc[,icpt:=1]
  wdfc = wdfc[,c('dh_e','hour','mo','icpt',wdfc.PCAs), with=F]
  
  # Check for within-group variance; can't invert a matrix where each within-group isn't zero, right?
  # Catalog all of the mo-hour-PCA combinations that have no variance and drop them from the
  # model matrix by name.
  wc = melt(wdfc[,lapply(.SD, var), by=.(mo, hour), .SDcols = wdfc.PCAs], id.vars = c('mo','hour'))[value<1,][,value:=NULL]  # value<1 usually, but dropping dh_e where solar has no var.
  wc[, dropvars:=paste0('as.factor(hour)',hour,':as.factor(mo)',mo,':',variable)]
  
  # Now, make the model matrix #
  PCAs.intx = paste0('as.factor(hour):as.factor(mo):',wdfc.PCAs)  # these are the coef. FE's
  reg.form = as.formula(paste(' ~',paste(PCAs.intx, collapse=' + ')))
  wdfc.mm = model.matrix(reg.form, data=wdfc)
  wdfc.mm = wdfc.mm[,setdiff(colnames(wdfc.mm), wc[,dropvars])]  # only keep the columns (intx vars) not in dropvars (those that have zero within-variance) )
  wdfc.mm = wdfc.mm[,apply(wdfc.mm, MAR=2, var)!=0]  #--> within some cluster-hour-month groupings, there is no variance
  #  dim(wdfc.mm)                                              ###---> hmm, is this an issue for demeanlist? dropping these? No...should be OK. just dropping cols, not row.
  
  if(dim(wdfc.mm)[[1]]<=dim(wdfc.mm)[[2]]) {
    print('Not enough obs'); stop}

  #  if('gpuR' %in% .packages(T)){
  if(exists('gpuMatrix')){
    gpu.data.type = 'float'
    if(dim(wdfc.mm)[[2]]<10000) gpu.data.type = 'double'
    cat(paste0('i=',' data type: ', gpu.data.type, '\n'))
    wdfc.mm.g = gpuMatrix(wdfc.mm, type=gpu.data.type)
    cat('gpuR used\n')
  } else {
    cat(paste0('i=', ' non-gpu data type\n'))
    wdfc.mm.g = as.matrix(wdfc.mm)
    cat('gpuR NOT used\n')
  }
  
  cat('qr with-SOL (WEST)\n')
  Xs = list(qr.SOL = qr(wdfc.mm.g))
  rm(wdfc.mm.g);rm(wdfc);rm(wdfc.mm)
  gc()
} # end ALL are T; all are "WEST'
  
cat('Finished QR on ');cat(df.iter$ymn);cat('\n')


coef.results = list()

for(a in 1:length(df.iter$df)){
  cat('ORISPL_');cat(df.iter$df[[a]]$ORISPL[1]);cat(' begins...\n')
    why = df.iter$df[[a]]
      Ys.vars = c('SO2_MASS','NOX_MASS','CO2_MASS','GLOAD_MASS')
      why[,(Ys.vars):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym), .SDcols = Ys.vars]

    for(ss in 1:length(Xs)){
      isSOL = names(Xs)[[ss]]
      cat(isSOL); cat('\n')
      cf = as.data.table(qr.coef(Xs[[ss]], as.matrix(why[,Ys.vars, with=F])),
                         keep.rownames = 'coefnames')

      cf[,c('hour','mo','PCA'):=tstrsplit(coefnames, ':')]
      cf[,hour:=tstrsplit(hour, ')', keep=2, type.convert=T)]
      cf[,  mo:=tstrsplit(mo  , ')', keep=2, type.convert=T)]
      cf[, c('ORISPL','coefnames','isSOL','ymn', 'intx'):=list(df.iter$df[[a]]$ORISPL[1],NULL, isSOL, df.iter$ymn, jd$intx[1])]
      cf = cf[order(mo, hour, PCA), .(ymn, intx, ORISPL, PCA, mo, hour, isSOL, SO2_MASS, NOX_MASS, CO2_MASS, GLOAD_MASS)]
      
    coef.results[[length(coef.results)+1]] = cf
    } # end ss loop (over noSOL/SOL)

if(a%%3==0) gc()
} # end a-loop over ORISPL
rm(Xs); gc()
return(rbindlist(coef.results, fill=T)) 
} # end dopar

Blist[[B]] = rbindlist(RES, fill=T)[,iterB:=B]

if(B==1) plot(Blist[[B]] %>% dplyr::filter(abs(GLOAD_MASS)<10) %>% dplyr::filter(hour>6 & hour<22) %>% group_by(mo, hour, iterB) %>%
                summarize(GLOAD_MASS = sum(GLOAD_MASS)) %>% ungroup() %>% dplyr::mutate(x = 1:n()) %>%
                dplyr::select(x, GLOAD_MASS), type='l', col = 'gray80', ylim = c(-20, 100))
if(B==2) lines(Blist[[B]] %>% dplyr::filter(abs(GLOAD_MASS)<10) %>% dplyr::filter(hour>6 & hour<22) %>% group_by(mo, hour, iterB) %>%
                summarize(GLOAD_MASS = sum(GLOAD_MASS)) %>% ungroup() %>% dplyr::mutate(x = 1:n()) %>%
                dplyr::select(x, GLOAD_MASS), type='l', col = 'red')
if(B>2) lines(Blist[[B]] %>% dplyr::filter(abs(GLOAD_MASS)<10) %>% dplyr::filter(hour>6 & hour<22) %>% group_by(mo, hour, iterB) %>%
                summarize(GLOAD_MASS = sum(GLOAD_MASS)) %>% ungroup() %>% dplyr::mutate(x = 1:n()) %>%
                dplyr::select(x, GLOAD_MASS), type='l', col = 'gray80')
}

saveRDS(Blist, file.path(WORK.OUT, 'BlistTexas.rds')) 



